Exercise 1: American Airlines Employees¶

In [ ]:
# loading libraries

library(tibble)
library(dplyr)
library(ggplot2)
library(ggpubr)
library(scales)
library(lubridate)
In [ ]:
# plot size

std_width  <- 14
std_height <- 9
options(repr.plot.width = std_width, repr.plot.height = std_height)
In [ ]:
# colors for plot

color_palette <- c( "#58508d",
                    "#bc5090",
                    "#ff6361",
                    "#ffa600" )

1) Read and import the data¶

In [ ]:
# import data 

american <- read.table( "american_airline_empl.txt", header = T, sep = "\t" )
delta    <- read.table( "delta_airline_empl.txt",    header = T, sep = "\t" )
fedex    <- read.table( "federal_express_empl.txt",  header = T, sep = "\t" )
united   <- read.table( "united_airline_empl.txt",   header = T, sep = "\t" )

2) Create a common tibble¶

In [ ]:
# add airline column to each dataset

american <- american %>% add_column(Company = "American Airlines")
delta <- delta %>% add_column(Company = "Delta Airlines")
fedex <- fedex %>% add_column(Company = "Federal Express Airlines")
united <- united %>% add_column(Company = "United Airlines")
In [ ]:
# create unique tibble

data <- rbind(american, delta, fedex, united)
data <- as_tibble(data)
head(data)
A tibble: 6 × 6
MonthYearFull.timePart.timeGrand.TotalCompany
<int><int><chr><chr><chr><chr>
1199068,1379,03977,176American Airlines
2199068,7259,27377,998American Airlines
3199069,5099,37678,885American Airlines
4199069,7139,32679,039American Airlines
5199070,3769,30979,685American Airlines
6199071,2589,36980,627American Airlines
In [ ]:
# adjusting tibble (removing commas)

data <- data %>% transmute( 
                Month = Month,
                Year = Year,
                FT = as.numeric(sub(",", "", Full.time, fixed = TRUE)),
                PT = as.numeric(sub(",", "", Part.time, fixed = TRUE)),
                Tot = as.numeric(sub(",", "", Grand.Total, fixed = TRUE)),
                Company = Company )

3) Plot of the employees as a function of time for all companies¶

In [ ]:
# calculating number of employees annual mean for each company 

dataM <- data %>% group_by(Year, Company)
# filtering out year 2023 because in this case we don't need it
dataM <- filter(dataM, Year != 2023) 
dataM <- dataM %>% summarise(FT = mean(FT), PT = mean(PT), Tot = mean(Tot))
`summarise()` has grouped output by 'Year'. You can override using the
`.groups` argument.
In [ ]:
head(dataM)
A grouped_df: 6 × 5
YearCompanyFTPTTot
<int><chr><dbl><dbl><dbl>
1990American Airlines 71252.75 9463.41780716.17
1990Delta Airlines 57445.75 4301.50061747.25
1990Federal Express Airlines64834.2523565.25088399.50
1990United Airlines 67644.58 5690.66773335.25
1991American Airlines 77587.9211084.58388672.50
1991Delta Airlines 62714.42 4951.00067665.42
In [ ]:
# full-time plot

gFT <- ggplot( dataM, aes(x = Year, y = FT, color=Company) ) +
       geom_point( size = 3 ) +
       geom_line( linewidth = 1 )

gFT <- gFT + 
       theme_bw() +
       theme( legend.position = "right",
        plot.title = element_text(size = 26, hjust = 0.5),
        axis.title = element_text(size = 20),
        axis.text  = element_text(size = 18),
        legend.title = element_text(size = 20, hjust = 0.5),
        legend.text  = element_text(size = 18),
        panel.grid.major.y = element_line( color = alpha("black",0.2),
                                           linewidth = 0.3,
                                           linetype = 2 ),
        panel.grid.major.x = element_line( color = alpha("black",0.2),
                                          linewidth = 0.3,
                                          linetype = 2 ),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.background = element_rect( fill = "white",
                                          linewidth = 0.5, 
                                          linetype = "solid", 
                                          colour = "grey" ) ) +
        scale_color_manual(values = color_palette) +
        labs( x = "Year", 
              y = "Full-Time Employees", 
              title = "Full-Time Employees Annual Averages" )

gFT <- gFT +
        scale_x_continuous(breaks = seq(1990, 2024, by = 5)) + 
        scale_y_continuous(n.breaks = 7)
In [ ]:
# part-time plot

gPT <- ggplot( dataM, aes(x = Year, y = PT, color=Company) ) +
       geom_point( size = 3 ) +
       geom_line( linewidth = 1 )

gPT <- gPT + 
       theme_bw() +
       theme( legend.position = "right",
        plot.title = element_text(size = 26, hjust = 0.5),
        axis.title = element_text(size = 20),
        axis.text  = element_text(size = 18),
        legend.title = element_text(size = 20, hjust = 0.5),
        legend.text  = element_text(size = 18),
        panel.grid.major.y = element_line( color = alpha("black",0.2),
                                           size = 0.3,
                                           linetype = 2 ),
        panel.grid.major.x = element_line( color = alpha("black",0.2),
                                          size = 0.3,
                                          linetype = 2 ),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.background = element_rect( fill = "white",
                                          linewidth = 0.5, 
                                          linetype = "solid", 
                                          colour = "grey" ) ) +
        scale_color_manual(values = color_palette) +
        labs( x = "Year", 
              y = "Part-Time Employees", 
              title = "Part-Time Employees Annual Averages" )

gPT <- gPT +
        scale_x_continuous( breaks = seq(1990, 2024, by = 5) ) + 
        scale_y_continuous( breaks = seq(0, 70000, by = 10000) )
In [ ]:
gFT
gPT
In [ ]:
# day by day

dataT <- data %>% group_by(Company)
dataT <- dataT %>% transmute(
                    # days sequence
                    Time = seq( as.Date("1990/01/01"), as.Date("2023/01/01"), "month" ), 
                    FT = FT, 
                    PT = PT,
                    Tot = Tot,
                    Company = Company )
In [ ]:
# plot FT

g3a <- ggplot(dataT, aes(x = Time, y = FT, color = Company)) +
       geom_line(linewidth = 1.5 ) 

g3a <- g3a + 
       theme_bw() +
       theme( legend.position = "right",
        plot.title = element_text(size = 26, hjust = 0.5),
        axis.title = element_text(size = 20),
        axis.text  = element_text(size = 18),
        legend.title = element_text(size = 20, hjust = 0.5),
        legend.text  = element_text(size = 18),
        panel.grid.major.y = element_line( color = alpha("black",0.2),
                                           size = 0.3,
                                           linetype = 2 ),
        panel.grid.major.x = element_line( color = alpha("black",0.2),
                                          size = 0.3,
                                          linetype = 2 ),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.background = element_rect( fill = "white",
                                          linewidth = 0.5, 
                                          linetype = "solid", 
                                          colour = "grey" ) ) +
        scale_color_manual(values = color_palette) +
        labs( x = "Time", 
              y = "Full-Time Employees", 
              title = "Full-Time Employees in Years 1990 - 2023" )

g3a <- g3a +
        scale_x_date(date_breaks = "3 year", date_labels = "%Y") +
        scale_y_continuous( n.breaks = 9 )
In [ ]:
# plot PT

g3b <- ggplot(dataT, aes(x = Time, y = PT, color = Company)) +
       geom_line(linewidth = 1.5 ) 

g3b <- g3b + 
       theme_bw() +
       theme( legend.position = "right",
        plot.title = element_text(size = 26, hjust = 0.5),
        axis.title = element_text(size = 20),
        axis.text  = element_text(size = 18),
        legend.title = element_text(size = 20, hjust = 0.5),
        legend.text  = element_text(size = 18),
        panel.grid.major.y = element_line( color = alpha("black",0.2),
                                           size = 0.3,
                                           linetype = 2 ),
        panel.grid.major.x = element_line( color = alpha("black",0.2),
                                          size = 0.3,
                                          linetype = 2 ),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.background = element_rect( fill = "white",
                                          linewidth = 0.5, 
                                          linetype = "solid", 
                                          colour = "grey" ) ) +
        scale_color_manual(values = color_palette) +
        labs( x = "Time", 
              y = "Part-Time Employees", 
              title = "Part-Time Employees in Years 1990 - 2023" )

g3b <- g3b +
        scale_x_date(date_breaks = "3 year", date_labels = "%Y") +
        scale_y_continuous( n.breaks = 9 )
In [ ]:
g3a
g3b

4) Minimum and maximum number of employees for each company¶

In [ ]:
# minimum number of employees

filter(dataT, Tot <= min(Tot))
A grouped_df: 4 × 5
TimeFTPTTotCompany
<date><dbl><dbl><dbl><chr>
2013-09-0155462 682862290American Airlines
2006-11-0141948 446246410Delta Airlines
1990-01-01613052358084885Federal Express Airlines
2011-06-0140522 525945781United Airlines
In [ ]:
# maximum number of employees

filter(dataT, Tot >= max(Tot))
A grouped_df: 4 × 5
TimeFTPTTotCompany
<date><dbl><dbl><dbl><chr>
2018-06-01 9654312628109171American Airlines
2023-01-01 94236 439 94675Delta Airlines
2021-03-0120440665977270383Federal Express Airlines
2001-03-01 9104111005102046United Airlines
In [ ]:
# plot to visualize min and max number of employees for each company
# plot of total number of employees trend over years for each company

gET <- ggplot( dataT, aes(x = Time, y = Tot, color = Company) ) +
       geom_line( linewidth = 1.3 )

gET <- gET + 
       theme_bw() +
       theme( legend.position = "right",
        plot.title = element_text(size = 26, hjust = 0.5),
        axis.title = element_text(size = 20),
        axis.text  = element_text(size = 18),
        legend.title = element_text(size = 20, hjust = 0.5),
        legend.text  = element_text(size = 18),
        panel.grid.major.y = element_line( color = alpha("black",0.2),
                                           size = 0.3,
                                           linetype = 2 ),
        panel.grid.major.x = element_line( color = alpha("black",0.2),
                                          size = 0.3,
                                          linetype = 2 ),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.background = element_rect( fill = "white",
                                          linewidth = 0.5, 
                                          linetype = "solid", 
                                          colour = "grey" ) ) +
        scale_color_manual(values = color_palette) +
        labs( x = "Time", 
              y = "Total Employees", 
              title = "Total Employees in Years 1990 - 2023" )

gET <- gET +
        scale_x_date(date_breaks = "3 year", date_labels = "%Y") +
        scale_y_continuous( n.breaks = 9 )
In [ ]:
gET

5) Plot of the fraction of PT worker over TOT employees as a function of time¶

In [ ]:
# calculating PT/TOT fraction

dataF <- dataT %>% transmute( Time = Time,
                              PTF = PT / Tot,
                              Company = Company)
In [ ]:
# plot

gPTF <- ggplot( dataF, aes(x = Time, y = PTF, color = Company) ) +
       geom_line( linewidth = 1.3 )

gPTF <- gPTF + 
       theme_bw() +
       theme( legend.position = "right",
        plot.title = element_text(size = 26, hjust = 0.5),
        axis.title = element_text(size = 20),
        axis.text  = element_text(size = 18),
        legend.title = element_text(size = 20, hjust = 0.5),
        legend.text  = element_text(size = 18),
        panel.grid.major.y = element_line( color = alpha("black",0.2),
                                           size = 0.3,
                                           linetype = 2 ),
        panel.grid.major.x = element_line( color = alpha("black",0.2),
                                          size = 0.3,
                                          linetype = 2 ),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.background = element_rect( fill = "white",
                                          linewidth = 0.5, 
                                          linetype = "solid", 
                                          colour = "grey" ) ) +
        scale_color_manual(values = color_palette) +
        labs( x = "Time", 
              y = "% Part-Time Employees", 
              title = "Part-Time employees on Total Employees over Time" )

gPTF <- gPTF +
        scale_x_date(date_breaks = "5 years", date_labels = "%Y") +
        scale_y_continuous(labels = scales::percent)
In [ ]:
gPTF

6) Influence of pandemic: trend in years 2019 - 2023¶

In [ ]:
# analyzing years 2019-2023
dataP <- filter(dataT, Time >= "2019/01/01")
In [ ]:
# plot

gTP <- ggplot( dataP, aes(x = Time, y = Tot, color = Company) ) +
       geom_point( size = 3 ) +
       geom_line( linewidth = 1 )

gTP <- gTP + 
       theme_bw() +
       theme( legend.position = "right",
        plot.title = element_text(size = 26, hjust = 0.5),
        axis.title = element_text(size = 20),
        axis.text  = element_text(size = 16),
        axis.text.x=element_text(angle=30, hjust=1),
        legend.title = element_text(size = 20, hjust = 0.5),
        legend.text  = element_text(size = 18),
        panel.grid.major.y = element_line( color = alpha("black",0.2),
                                           size = 0.3,
                                           linetype = 2 ),
        panel.grid.major.x = element_line( color = alpha("black",0.2),
                                          size = 0.3,
                                          linetype = 2 ),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.background = element_rect( fill = "white",
                                          linewidth = 0.5, 
                                          linetype = "solid", 
                                          colour = "grey" ) ) +
        scale_color_manual(values = color_palette) +
        labs( x = "Time", 
              y = "Total Employees", 
              title = "Total Employees in Years 2019 - 2023" )

gTP <- gTP +
        scale_x_date(date_breaks = "6 month", date_labels = "%Y %b") +
        scale_y_continuous( n.breaks = 8 )
In [ ]:
gTP

Apart from FedEx Airlines, a similar trend during years 2019 - 2023 is evident for the other three.

Globally, with the outbreak of the pandemic, from March 2020 the number of employees begins to decline to a relative minimum and then steadily rise again, returning, in January 2023, to about pre-pandemic levels. Specifically, Delta Airlines is the first company to reach the relative minimum, nearly halving the number of employees. In contrast, American Airlines and United Airlines follow a very similar trend, reaching a relative minimum of total employees in October 2020 and then slowly increasing. Moreover, the relative lows reached during the pandemic are not as low as the lows reached due to the effects of the 2007-2008 financial crisis.

These trends show that the pandemic has affected the number of total employees for the airlines analyzed (apart from FedEx). In fact, the number of total employees declined following the outbreak of the pandemic, reaching a low more or less in a short period of time, and then rising steadily until today.

As for FedEx Airlines, on the other hand, the trend is completely different, overall increasing since 1990. Moreover, for this company, the number of total employees increases rapidly from the outbreak of pandemic to the end of 2020.


Exercise 2: Data Frames and Tibble¶

  • nycflights13 package
In [ ]:
library(nycflights13)

1.1) Total number of flights departed from each of the three NYC airports as a function of time (one entry for each of the 365 days of the year)¶

In [ ]:
head(flights)
A tibble: 6 × 19
yearmonthdaydep_timesched_dep_timedep_delayarr_timesched_arr_timearr_delaycarrierflighttailnumorigindestair_timedistancehourminutetime_hour
<int><int><int><int><int><dbl><int><int><dbl><chr><int><chr><chr><chr><dbl><dbl><dbl><dbl><dttm>
201311517515 2 830 819 11UA1545N14228EWRIAH22714005152013-01-01 05:00:00
201311533529 4 850 830 20UA1714N24211LGAIAH22714165292013-01-01 05:00:00
201311542540 2 923 850 33AA1141N619AAJFKMIA16010895402013-01-01 05:00:00
201311544545-110041022-18B6 725N804JBJFKBQN18315765452013-01-01 05:00:00
201311554600-6 812 837-25DL 461N668DNLGAATL116 7626 02013-01-01 06:00:00
201311554558-4 740 728 12UA1696N39463EWRORD150 7195582013-01-01 05:00:00
In [ ]:
fNYC <- flights %>% group_by(year, month, day, origin)
fNYC <- fNYC %>% summarise(tot = length(flight))
fNYC <- fNYC %>% transmute( date = paste(year, month, day, sep = "-"),
                            origin = origin,
                            tot = tot )
fNYC <- fNYC %>% transmute( date = as.Date(date),
                            Airport = origin,
                            tot = tot )
head(fNYC)
`summarise()` has grouped output by 'year', 'month', 'day'. You can override
using the `.groups` argument.
A grouped_df: 6 × 6
yearmonthdaydateAirporttot
<int><int><int><date><chr><int>
2013112013-01-01EWR305
2013112013-01-01JFK297
2013112013-01-01LGA240
2013122013-01-02EWR350
2013122013-01-02JFK321
2013122013-01-02LGA272
In [ ]:
# plot

g <- ggplot( fNYC, aes( x = date, y = tot, color = Airport) ) + 
    geom_line( linewidth = 0.7) + 
    geom_point( size = 1)

g <- g +
    theme_bw() + 
    theme( legend.position = "right",
        plot.title = element_text(size = 22, hjust = 0.5),
        axis.title = element_text(size = 18),
        axis.text  = element_text(size = 16),
        legend.title = element_text(size = 20, hjust = 0.5),
        legend.text  = element_text(size = 18),
        panel.grid.major.y = element_line( color = alpha("black",0.2),
                                           linewidth = 0.3,
                                           linetype = 2 ),
        panel.grid.major.x = element_line( color = alpha("black",0.2),
                                           linewidth = 0.3,
                                           linetype = 2 ),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.background = element_rect( fill = "white",
                                          linewidth = 0.5, 
                                          linetype = "solid", 
                                          colour = "grey" ) ) +
        scale_color_manual(values = color_palette) +
        labs( x = "Year", 
              y = "Total Number of Flights", 
              title = "Total Number of Flights from each of the three NYC airports during 2013" )

g <- g +
        scale_x_date(date_breaks = "1 month", date_labels = "%b") +
        scale_y_continuous( n.breaks = 10 )
In [ ]:
g

1.2.1) Average number of flights computed over the first five working days of each week as a function of the week number of the year¶

In [ ]:
fNYC <- fNYC %>% transmute( week = isoweek(ymd(date)),
                            date = date,
                            Airport = Airport,
                            tot = tot )
fNYC <- fNYC %>% transmute( week = week,
                            dow = wday(date, week_start = 1),
                            date = date,
                            Airport = Airport,
                            tot = tot )
head(fNYC)
A grouped_df: 6 × 8
yearmonthdayweekdowdateAirporttot
<int><int><int><dbl><dbl><date><chr><int>
201311122013-01-01EWR305
201311122013-01-01JFK297
201311122013-01-01LGA240
201312132013-01-02EWR350
201312132013-01-02JFK321
201312132013-01-02LGA272
In [ ]:
# average over first 5 working days

fNYCwd <- filter(fNYC, dow <=5)
fNYCwd <- fNYCwd %>% group_by(week, Airport)
fNYCwd <- fNYCwd %>% transmute( week = week,
                            dow = dow,
                            date = date,
                            Airport = Airport,
                            tot = mean(tot) )
head(fNYCwd)
A grouped_df: 6 × 5
weekdowdateAirporttot
<dbl><dbl><date><chr><dbl>
122013-01-01EWR325.6667
122013-01-01JFK308.1667
122013-01-01LGA259.1667
132013-01-02EWR325.6667
132013-01-02JFK308.1667
132013-01-02LGA259.1667
In [ ]:
# plot 

gwd <- ggplot(fNYCwd, aes( x = week, y = tot, color = Airport) ) + 
        geom_line( linewidth = 1 ) + 
        geom_point( size = 1.6 )

gwd <- gwd +
    theme_bw() + 
    theme( legend.position = "right",
        plot.title = element_text(size = 22, hjust = 0.5),
        axis.title = element_text(size = 18),
        axis.text  = element_text(size = 16),
        legend.title = element_text(size = 20, hjust = 0.5),
        legend.text  = element_text(size = 18),
        panel.grid.major.y = element_line( color = alpha("black",0.2),
                                           linewidth = 0.3,
                                           linetype = 2 ),
        panel.grid.major.x = element_line( color = alpha("black",0.2),
                                           linewidth = 0.3,
                                           linetype = 2 ),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.background = element_rect( fill = "white",
                                          linewidth = 0.5, 
                                          linetype = "solid", 
                                          colour = "grey" ) ) +
        scale_color_manual(values = color_palette) +
        labs( x = "Week", 
              y = "Average over Working Days", 
              title = "Average Number of Flights over Working Days from NYC Airports in 2013" )

gwd <- gwd +
        scale_x_continuous( n.breaks = 10 ) +
        scale_y_continuous( n.breaks = 10 )
In [ ]:
gwd
In [ ]:
# average on we

fNYCwe <- filter(fNYC, dow > 5)
fNYCwe <- fNYCwe %>% group_by(week, Airport)
fNYCwe <- fNYCwe %>% transmute( week = week,
                            dow = dow,
                            date = date,
                            Airport = Airport,
                            tot = mean(tot) )
head(fNYCwe)
A grouped_df: 6 × 5
weekdowdateAirporttot
<dbl><dbl><date><chr><dbl>
162013-01-05EWR269.5
162013-01-05JFK304.5
162013-01-05LGA202.0
172013-01-06EWR269.5
172013-01-06JFK304.5
172013-01-06LGA202.0
In [ ]:
# plot 

gwe <- ggplot(fNYCwe, aes( x = week, y = tot, color = Airport) ) + 
        geom_line( linewidth = 1 ) + 
        geom_point( size = 1.6 )

gwe <- gwe +
    theme_bw() + 
    theme( legend.position = "right",
        plot.title = element_text(size = 22, hjust = 0.5),
        axis.title = element_text(size = 18),
        axis.text  = element_text(size = 16),
        legend.title = element_text(size = 20, hjust = 0.5),
        legend.text  = element_text(size = 18),
        panel.grid.major.y = element_line( color = alpha("black",0.2),
                                           linewidth = 0.3,
                                           linetype = 2 ),
        panel.grid.major.x = element_line( color = alpha("black",0.2),
                                           linewidth = 0.3,
                                           linetype = 2 ),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.background = element_rect( fill = "white",
                                          linewidth = 0.5, 
                                          linetype = "solid", 
                                          colour = "grey" ) ) +
        scale_color_manual(values = color_palette) +
        labs( x = "Week", 
              y = "Average over Weekend", 
              title = "Average Number of Flights over Weekend from NYC Airports in 2013" )

gwe <- gwe +
        scale_x_continuous( n.breaks = 10 ) +
        scale_y_continuous( n.breaks = 10 )
In [ ]:
gwe

2) Departure delay¶

In [ ]:
dd <- flights %>% group_by(year, month, day, origin)
dd <- dd %>% transmute( date = paste(year, month, day, sep = "-"),
                        dep_delay = dep_delay,
                        flight = flight )
dd <- dd %>% transmute( date = as.Date(date),
                        depDelay = dep_delay,
                        flight = flight,
                        Airport = origin )

dd <- dd %>% ungroup()
#  filtering out NA in depDelay
dd <- dd %>% filter(!is.na(depDelay)) 

head(dd)
A tibble: 6 × 8
yearmonthdayorigindatedepDelayflightAirport
<int><int><int><chr><date><dbl><int><chr>
201311EWR2013-01-01 21545EWR
201311LGA2013-01-01 41714LGA
201311JFK2013-01-01 21141JFK
201311JFK2013-01-01-1 725JFK
201311LGA2013-01-01-6 461LGA
201311EWR2013-01-01-41696EWR
In [ ]:
# filter by airport
dEWR <- dd %>% filter(origin == "EWR")
dLGA <- dd %>% filter(origin == "LGA")
dJFK <- dd %>% filter(origin == "JFK")
In [ ]:
# minimum delays --> departure ahead of time

# group by date
dEWR <- dEWR %>% group_by(date)
dLGA <- dLGA %>% group_by(date)
dJFK <- dJFK %>% group_by(date)

dEWRmin <- filter(dEWR, depDelay <= min(depDelay))
dLGAmin <- filter(dLGA, depDelay <= min(depDelay))
dJFKmin <- filter(dJFK, depDelay <= min(depDelay))
In [ ]:
# maximum delays

dEWRmax <- filter(dEWR, depDelay >= max(depDelay))
dLGAmax <- filter(dLGA, depDelay >= max(depDelay))
dJFKmax <- filter(dJFK, depDelay >= max(depDelay))
In [ ]:
# average 

dEWRmean <- dEWR %>% summarise( meanDelay = mean(depDelay))
dLGAmean <- dLGA %>% summarise( meanDelay = mean(depDelay))
dJFKmean <- dJFK %>% summarise( meanDelay = mean(depDelay))
In [ ]:
# plot for each airport

gEWR <- ggplot() + 
        geom_line(data = dEWRmean, aes(x = date, y = meanDelay, color = "Mean"))

gEWR <- gEWR + 
        geom_line(data = dEWRmax, aes(x = date, y = depDelay, color = "Maximum"))

gEWR <- gEWR +  
        geom_line(data = dEWRmin, aes(x = date, y = depDelay, color = "Minimum"))

gEWR <- gEWR + 
        theme_bw() + 
        theme( legend.position = "right",
        plot.title = element_text(size = 22, hjust = 0.5),
        axis.title = element_text(size = 18),
        axis.text  = element_text(size = 16),
        legend.title = element_text(size = 20, hjust = 0.5),
        legend.text  = element_text(size = 18),
        panel.grid.major.y = element_line( color = alpha("black",0.2),
                                           linewidth = 0.3,
                                           linetype = 2 ),
        panel.grid.major.x = element_line( color = alpha("black",0.2),
                                           linewidth = 0.3,
                                           linetype = 2 ),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.background = element_rect( fill = "white",
                                          linewidth = 0.5, 
                                          linetype = "solid", 
                                          colour = "grey" ) ) +
        scale_color_manual(values = color_palette) +
        labs( x = "Days", 
              y = "Delay (min/day)", 
              title = "Departure Delay per Day for Flights from EWR Airport in NYC (2013)" )

gEWR <- gEWR +
        scale_x_date(date_breaks = "2 month", date_labels = "%b-%d") +
        scale_y_continuous( n.breaks = 8 ) +
        labs(color='Delay')
In [ ]:
# plot LGA

gLGA <- ggplot() + 
        geom_line(data = dLGAmean, aes(x = date, y = meanDelay, color = "Mean"))

gLGA <- gLGA + 
        geom_line(data = dLGAmax, aes(x = date, y = depDelay, color = "Maximum"))

gLGA <- gLGA +  
        geom_line(data = dLGAmin, aes(x = date, y = depDelay, color = "Minimum"))

gLGA <- gLGA + 
        theme_bw() + 
        theme( legend.position = "right",
        plot.title = element_text(size = 22, hjust = 0.5),
        axis.title = element_text(size = 18),
        axis.text  = element_text(size = 16),
        legend.title = element_text(size = 20, hjust = 0.5),
        legend.text  = element_text(size = 18),
        panel.grid.major.y = element_line( color = alpha("black",0.2),
                                           linewidth = 0.3,
                                           linetype = 2 ),
        panel.grid.major.x = element_line( color = alpha("black",0.2),
                                           linewidth = 0.3,
                                           linetype = 2 ),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.background = element_rect( fill = "white",
                                          linewidth = 0.5, 
                                          linetype = "solid", 
                                          colour = "grey" ) ) +
        scale_color_manual(values = color_palette) +
        labs( x = "Days", 
              y = "Delay (min/day)", 
              title = "Departure Delay per Day for Flights from LGA Airport in NYC (2013)" )

gLGA <- gLGA +
        scale_x_date(date_breaks = "2 month", date_labels = "%b-%d") +
        scale_y_continuous( n.breaks = 8 ) +
        labs(color='Delay')
In [ ]:
# plot JFK

gJFK <- ggplot() + 
        geom_line(data = dJFKmean, aes(x = date, y = meanDelay, color = "Mean"))

gJFK <- gJFK + 
        geom_line(data = dJFKmax, aes(x = date, y = depDelay, color = "Maximum"))

gJFK <- gJFK +  
        geom_line(data = dJFKmin, aes(x = date, y = depDelay, color = "Minimum"))

gJFK <- gJFK + 
        theme_bw() + 
        theme( legend.position = "right",
        plot.title = element_text(size = 22, hjust = 0.5),
        axis.title = element_text(size = 18),
        axis.text  = element_text(size = 16),
        legend.title = element_text(size = 20, hjust = 0.5),
        legend.text  = element_text(size = 18),
        panel.grid.major.y = element_line( color = alpha("black",0.2),
                                           linewidth = 0.3,
                                           linetype = 2 ),
        panel.grid.major.x = element_line( color = alpha("black",0.2),
                                           linewidth = 0.3,
                                           linetype = 2 ),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.background = element_rect( fill = "white",
                                          linewidth = 0.5, 
                                          linetype = "solid", 
                                          colour = "grey" ) ) +
        scale_color_manual(values = color_palette) +
        labs( x = "Days", 
              y = "Delay (min/day)", 
              title = "Departure Delay per Day for Flights from JFK Airport in NYC (2013)" )

gJFK <- gJFK +
        scale_x_date(date_breaks = "2 month", date_labels = "%b-%d") +
        scale_y_continuous( n.breaks = 8 ) +
        labs(color='Delay')
In [ ]:
gEWR
gLGA
gJFK

3) Average speed of each plane¶

In [ ]:
s <- flights %>% transmute( date = paste(year, month, day, sep = "-"),
                            duration = air_time / 60, # hours
                            distance = distance * 1.609,
                            plane = tailnum) # km

s <- s %>% transmute( date = as.Date(date),
                      avSpeed = distance / duration,
                      plane = plane)
s <- s %>% filter(!is.na(avSpeed))

s <- s %>% group_by(date)
s <- s %>% summarise(mean = mean(avSpeed))
head(s)
A tibble: 6 × 2
datemean
<date><dbl>
2013-01-01575.0771
2013-01-02587.5402
2013-01-03594.9819
2013-01-04616.8570
2013-01-05604.2606
2013-01-06593.3873
In [ ]:
# plot

ps <- ggplot(s, aes( x = date, y = mean) ) + 
      geom_line( linewidth = 0.8, color = "#58508d") 

ps <- ps +
        theme_bw() + 
        theme( plot.title = element_text(size = 22, hjust = 0.5),
        axis.title = element_text(size = 18),
        axis.text  = element_text(size = 16),
        legend.title = element_text(size = 20, hjust = 0.5),
        legend.text  = element_text(size = 18),
        panel.grid.major.y = element_line( color = alpha("black",0.2),
                                           linewidth = 0.3,
                                           linetype = 2 ),
        panel.grid.major.x = element_line( color = alpha("black",0.2),
                                           linewidth = 0.3,
                                           linetype = 2 ),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.background = element_rect( fill = "white",
                                          linewidth = 0.5, 
                                          linetype = "solid", 
                                          colour = "grey" ) ) +
        labs( x = "Days", 
              y = "Average Plane Speed (km/h)", 
              title = "Average Plane Speed per Day for Flights from NYC Airports in 2013" )

ps <- ps + 
      scale_x_date(date_breaks = "2 month", date_labels = "%b-%d") +
      scale_y_continuous( n.breaks = 8 )
In [ ]:
ps

4.1) Airline companies offering the largest two numbers of flights per day and per week¶

In [ ]:
w <- flights %>% transmute( date = paste(year, month, day, sep = "-"),
                            flight = flight,
                            Airline = carrier)
w <- w %>% transmute( date = as.Date(date),
                      week = isoweek(ymd(date)),
                      flight = flight,
                      Airline = Airline)

# per day

wd <- w %>% group_by(date, Airline)
wd <- wd %>% transmute( date = date,
                        totDay = length(flight),
                        Airline = Airline)

# finding the largest two numbers

wd <- filter(wd, totDay >= max(totDay))
wd <- wd %>% slice(1:1)
wd <- wd %>% ungroup()
wd <- wd %>% arrange(desc(totDay)) %>% group_by(date) %>% slice(1:2)

head(wd)
A grouped_df: 6 × 3
datetotDayAirline
<date><int><chr>
2013-01-01165UA
2013-01-01163B6
2013-01-02170UA
2013-01-02162B6
2013-01-03162B6
2013-01-03159UA
In [ ]:
# plot

m <- ggplot(wd, aes( x = date, y = totDay, color = Airline) ) + 
      geom_point( size = 2 ) 

m <- m +
        theme_bw() + 
        theme( plot.title = element_text(size = 22, hjust = 0.5),
        axis.title = element_text(size = 18),
        axis.text  = element_text(size = 16),
        legend.title = element_text(size = 20, hjust = 0.5),
        legend.text  = element_text(size = 18),
        panel.grid.major.y = element_line( color = alpha("black",0.2),
                                           linewidth = 0.3,
                                           linetype = 2 ),
        panel.grid.major.x = element_line( color = alpha("black",0.2),
                                           linewidth = 0.3,
                                           linetype = 2 ),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.background = element_rect( fill = "white",
                                          linewidth = 0.5, 
                                          linetype = "solid", 
                                          colour = "grey" ) ) +
        scale_color_manual(values = color_palette) +
        labs( x = "Days", 
              y = "Number of Flights", 
              title = "Two Largest Numbers of Flights per Day - Flights From NYC Airports, 2013" )

m <- m + 
      scale_x_date(date_breaks = "1 month", date_labels = "%m/%d") +
      scale_y_continuous( n.breaks = 8 )
In [ ]:
m
In [ ]:
# week

ww <- w %>% group_by(week, Airline)
ww <- ww %>% transmute( week = week,
                        totWeek = length(flight),
                        Airline = Airline)

ww <- filter(ww, totWeek >= max(totWeek))
ww <- ww %>% slice(1:1)
ww <- ww %>% ungroup()
ww <- ww %>% arrange(desc(totWeek)) %>% group_by(week) %>% slice(1:2)

head(ww)
A grouped_df: 6 × 3
weektotWeekAirline
<dbl><int><chr>
11284B6
11230UA
21035UA
2 994B6
31032UA
3 970B6
In [ ]:
# plot

mm <- ggplot(ww, aes( x = week, y = totWeek, color = Airline) ) + 
      geom_point( size = 3)

mm <- mm +
        theme_bw() + 
        theme( plot.title = element_text(size = 22, hjust = 0.5),
        axis.title = element_text(size = 18),
        axis.text  = element_text(size = 16),
        legend.title = element_text(size = 20, hjust = 0.5),
        legend.text  = element_text(size = 18),
        panel.grid.major.y = element_line( color = alpha("black",0.2),
                                           linewidth = 0.3,
                                           linetype = 2 ),
        panel.grid.major.x = element_line( color = alpha("black",0.2),
                                           linewidth = 0.3,
                                           linetype = 2 ),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.background = element_rect( fill = "white",
                                          linewidth = 0.5, 
                                          linetype = "solid", 
                                          colour = "grey" ) ) +
        scale_color_manual(values = color_palette) +
        labs( x = "Week Number", 
              y = "Number of Flights", 
              title = "Two Largest Numbers of Flights per Week - Flights From NYC Airports, 2013" )

mm <- mm + 
      scale_x_continuous( n.breaks = 10 ) +
      scale_y_continuous( n.breaks = 4 )
In [ ]:
mm

4.2) Airline company offering the smallest number of flight per month¶

In [ ]:
t <- flights %>% transmute( month = month.abb[month],
                            flight = flight,
                            Airline = carrier)
t <- t %>% group_by(month, Airline)
t <- t %>% transmute( month = month,
                      totMonth = length(flight),
                      Airline = Airline)
t <- filter(t, totMonth <= min(totMonth))
t <- t %>% slice(1:1)
t <- t %>% ungroup()
t <- t %>% arrange(desc(totMonth)) %>% group_by(month) %>% slice(1:1)
head(t)
A grouped_df: 6 × 3
monthtotMonthAirline
<chr><int><chr>
Apr5047UA
Aug5124UA
Dec4931UA
Feb4346UA
Jan4637UA
Jul5066UA
In [ ]:
# plot

z <- ggplot(t, aes( x = month, y = totMonth, color = Airline) ) + 
      geom_point( size = 4 ) 

z <- z +
        theme_bw() + 
        theme( plot.title = element_text(size = 22, hjust = 0.5),
        axis.title = element_text(size = 18),
        axis.text  = element_text(size = 16),
        legend.title = element_text(size = 20, hjust = 0.5),
        legend.text  = element_text(size = 18),
        panel.grid.major.y = element_line( color = alpha("black",0.2),
                                           linewidth = 0.3,
                                           linetype = 2 ),
        panel.grid.major.x = element_line( color = alpha("black",0.2),
                                           linewidth = 0.3,
                                           linetype = 2 ),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.background = element_rect( fill = "white",
                                          linewidth = 0.5, 
                                          linetype = "solid", 
                                          colour = "grey" ) ) +
        scale_color_manual(values = color_palette) +
        labs( x = "Month", 
              y = "Number of Flights", 
              title = "Smallest Numbers of Flights per Month - Flights From NYC Airports, 2013" )

z <- z + 
      scale_y_continuous( n.breaks = 7 )
In [ ]:
z

4.3) Airline company offering the longest distance flight per month¶

In [ ]:
q <- flights %>% transmute( month = month.abb[month],
                            Airline = carrier,
                            distance = distance * 1.609) # km
q <- q %>% group_by(month)
q <- filter(q, distance >= max(distance))
q <- q %>% slice(1:1)
q
A grouped_df: 12 × 3
monthAirlinedistance
<chr><chr><dbl>
AprHA8017.647
AugHA8017.647
DecHA8017.647
FebHA8017.647
JanHA8017.647
JulHA8017.647
JunHA8017.647
MarHA8017.647
MayHA8017.647
NovHA8017.647
OctHA8017.647
SepHA8017.647